library(dplyr)
county = read.csv('us_county.csv') #Received from prof. D Ficklin from geology department
fatalities = read.csv('final_data.csv')
county = county %>% select(STATE_NAME,STATE_FIPS)
county = county[!duplicated(county$STATE_NAME),]
#fatalities = merge(fatalities,county,by.x = "OBJECT_ID",by.y = "STATE_FIPS",all.x =FALSE)

Exploratory Data Analysis:

Now we have the data in the desired form with the State_name and their associated values. Lets do some exploratory data analysis:

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
accidentFatalitiesState =  fatalities %>% group_by(STATE_NAME) %>% summarise(count = n())
ggplot(data.frame(table(fatalities$STATE_NAME))) + aes(y = Freq, x = reorder(Var1,Freq)) + geom_bar(stat = 'identity') + coord_flip() + ylab("Total Number of fatalities accident") + xlab("State Name") + ggtitle("Fatalities accident by State") + scale_y_continuous(breaks = seq(1,max(accidentFatalitiesState$count),by = 500)) +  theme_minimal() + theme(title = element_text(family = "Trebuchet MS", color="#422164", face="bold", size=15)) + theme(text = element_text(size=8,colour="#266164"))

It seems that Texas, has the highest number of fatalities accident in U.S. Also these states has higher number of population in U.S. Let’s normalize by the population.
#####Population data is collected from https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population

population = read.csv('population.csv')
accidentFatalitiesState = merge(accidentFatalitiesState,select(fatalities,OBJECT_ID,STATE_NAME),by='STATE_NAME')
accidentFatalitiesState = accidentFatalitiesState[!duplicated(accidentFatalitiesState$STATE_NAME),]
accidentFatalitiesState  = merge(accidentFatalitiesState,population,by = 'OBJECT_ID')
accidentFatalitiesState = mutate(accidentFatalitiesState,ratio = log2(count/Population))
ggplot(accidentFatalitiesState) + aes(y = ratio, x = reorder(STATE_NAME,ratio)) + coord_flip() + geom_bar(stat='identity') + labs(title = 'Ratio of accident fatalities to population',y = 'Log Ratio',x = 'State Name') + theme_minimal() + theme(title = element_text(family = "Trebuchet MS", color="#422164", face="bold", size=15)) + theme(text = element_text(size=8,colour="#266164"))

After transforming the ratio of “Accident with Fatalities to Population in the state” on log_2 scale, it seems that Wisconsin and texas has the highest proportion of accidents. Now let’s check for drowsy drivers.

library(reshape)
drowsyAccidents  = fatalities %>% group_by(A_DROWSY,STATE_NAME) %>% summarise(count = n())
drowsyAccidents =cast(drowsyAccidents,STATE_NAME ~ A_DROWSY)
colnames(drowsyAccidents) = c("STATE_NAME","DROWSY","NOT_DROWSY")
drowsyAccidents[is.na(drowsyAccidents)] = 0
ggplot(drowsyAccidents) + aes(x = reorder(STATE_NAME,DROWSY/(NOT_DROWSY+DROWSY)), y = DROWSY/(NOT_DROWSY+DROWSY)) + geom_bar(stat = 'identity') + coord_flip()+ labs(title = 'Percentage of Drowsy drivers involved in accidents State-wise',y = 'Drowsy Drivers/Accident Fatalities',x = 'State Name') + theme_minimal() + theme(title = element_text(family = "Trebuchet MS", color="#422164", face="bold", size=12)) + theme(text = element_text(size=8,colour="#266164"))

It seems that Vermont has 10%, the highest accident fatalities involving drowsy driver. It would be interesting to know if there are particular counties in vermont that have highest proportion of fatalities involving drowsy driver.

library(maps)
all_states = map_data("county")
vermont = subset(all_states,region == 'vermont')
base = ggplot() + geom_polygon(data=vermont,aes(x=long,y=lat,group=group),colour = "white",fill='#E4CCCC')
base + geom_point(data = filter(fatalities,STATE_NAME=='Vermont'),aes(x = LONGITUD,y = LATITUDE,colour = as.factor(A_DROWSY)))+ labs(title = 'Location Map for fatalities involving drowsy driver in Vermont',x="",y="",color="") + theme_minimal() + theme(title = element_text(family = "Trebuchet MS", color="#422164", face="bold", size=12),legend.title = element_text(color="red")) + theme(text = element_text(colour="#266164"),axis.text.x = element_blank(),axis.text.y = element_blank()) + scale_color_manual(labels = c("Drowsy Driver", "Non Drowsy Driver"), values = c("red", "blue"))

As the data is very low, we can’t infer if there counties affect drowsy fatalities. Let’s remove some outliers. Hence removing, wherever we have unknown and unreported data, because this can provide uninterpretation of our model.

boxplot(fatalities$PERSONS)

quantile(log2(fatalities$PERSONS),prob = (1:100)/100)
##       1%       2%       3%       4%       5%       6%       7%       8% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##       9%      10%      11%      12%      13%      14%      15%      16% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      17%      18%      19%      20%      21%      22%      23%      24% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      25%      26%      27%      28%      29%      30%      31%      32% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      33%      34%      35%      36%      37%      38%      39%      40% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      41%      42%      43%      44%      45%      46%      47%      48% 
## 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      49%      50%      51%      52%      53%      54%      55%      56% 
## 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      57%      58%      59%      60%      61%      62%      63%      64% 
## 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      65%      66%      67%      68%      69%      70%      71%      72% 
## 1.000000 1.000000 1.000000 1.000000 1.584963 1.584963 1.584963 1.584963 
##      73%      74%      75%      76%      77%      78%      79%      80% 
## 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 
##      81%      82%      83%      84%      85%      86%      87%      88% 
## 1.584963 1.584963 1.584963 2.000000 2.000000 2.000000 2.000000 2.000000 
##      89%      90%      91%      92%      93%      94%      95%      96% 
## 2.000000 2.000000 2.321928 2.321928 2.321928 2.321928 2.584963 2.584963 
##      97%      98%      99%     100% 
## 2.584963 2.807355 3.000000 6.539159
quantile(log2(fatalities$PERMVIT),prob = (1:100)/100)
##       1%       2%       3%       4%       5%       6%       7%       8% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##       9%      10%      11%      12%      13%      14%      15%      16% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      17%      18%      19%      20%      21%      22%      23%      24% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      25%      26%      27%      28%      29%      30%      31%      32% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      33%      34%      35%      36%      37%      38%      39%      40% 
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
##      41%      42%      43%      44%      45%      46%      47%      48% 
## 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      49%      50%      51%      52%      53%      54%      55%      56% 
## 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      57%      58%      59%      60%      61%      62%      63%      64% 
## 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
##      65%      66%      67%      68%      69%      70%      71%      72% 
## 1.000000 1.000000 1.000000 1.000000 1.000000 1.584963 1.584963 1.584963 
##      73%      74%      75%      76%      77%      78%      79%      80% 
## 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 1.584963 
##      81%      82%      83%      84%      85%      86%      87%      88% 
## 1.584963 1.584963 1.584963 2.000000 2.000000 2.000000 2.000000 2.000000 
##      89%      90%      91%      92%      93%      94%      95%      96% 
## 2.000000 2.000000 2.000000 2.321928 2.321928 2.321928 2.584963 2.584963 
##      97%      98%      99%     100% 
## 2.584963 2.807355 3.000000 6.539159
fatalities = subset(fatalities,A_INTER != 3)
fatalities = subset(fatalities, A_TOD != 3)
fatalities = subset(fatalities, drunkDr %in% c(0,1))
fatalities = subset(fatalities, WEATHER %in% c(1,2,3,4,5,6,7,8,10,11,12))
fatalities = subset(fatalities, HOUR != 99)
fatalities = subset(fatalities, A_DOW != 3)
fatalities = subset(fatalities, TYP_INT %in% c(1,2,3,4,5,6,7,8,9,10))
fatalities = subset(fatalities, REL_ROAD %in% c(1,2,3,4,5,6,7,8,10,11))
fatalities = subset(fatalities, REL_ROAD %in% c(1,2,3,4,5,6))
fatalities = filter(fatalities,!LGT_COND %in% c(7,8,9))
ggplot(fatalities,aes(x = factor(STATE_NAME),y=FATALS)) + geom_boxplot(aes(colour = factor(STATE_NAME)),outlier.colour = "red", outlier.size = 1,) + coord_flip() +geom_jitter(size=0.01,alpha=0.05) + labs(title="Fatalities by State (BoxPlot)", x = "State Name", y="Total Fatalities")

fatalities = subset(fatalities, FATALS %in% c(1,2,3,4))
fatalities = subset(fatalities, A_RU != 3)
fatalities = subset(fatalities, A_SPCRA != 3)
fatalities = subset(fatalities, (PERSONS < 9) & (PERSONS > 0) & (PERSONS == PERMVIT) & (FATALS <= PERSONS))
fatalities$PERSONS = NULL
fatalities = subset(fatalities, ROUTE != 9)
fatalities = subset(fatalities, SP_JUR != 9)
fatalities = filter(fatalities, !RELJCT1 %in% c(8,9), !RELJCT2 %in% c(98,99))
fatalities = filter(fatalities, WRK_ZONE != 3)
fatalities = filter(fatalities, !HARM_EV %in% c(6,72,99,73,45,51,2,50,49,20,44,40,48,54,16,3,7,46,41,26,19,57,17,39,58,10,21,23,99))

fatalities$RELJCT1 = NULL

It seems that fatalities are not affected by state. Now let’s do some hypothesis testing. We assume the following hypothesis:
1. Drunken Driver is independent of Age group 15 - 20
2. Drunken Driver is independent of Age group 21-24
3. Drunken Driver is independent of Age group greater than 65+
4. Does the above age group affects Drunken Driver

ageDrunkFat = rbind(table(fatalities$A_D21_24,fatalities$drunkDr)[1,],table(fatalities$A_D15_20,fatalities$drunkDr)[1,], table(fatalities$A_D65PLS,fatalities$drunkDr)[1,])
chisq.test(ageDrunkFat)
## 
##  Pearson's Chi-squared test
## 
## data:  ageDrunkFat
## X-squared = 645.78, df = 2, p-value < 2.2e-16

As, P-value is less than significance value, hence we can say that drunker driver fatalities are not independent of Age of the Person. This means, that age does affect drunken driving. Lets check how weather affects accidents

library(heatmaply)
wea = sort(table(fatalities$WEATHER))
print(wea/sum(wea))
## 
##            7            6           11           12            8 
## 0.0001877229 0.0007508917 0.0007508917 0.0007884363 0.0018772292 
##            3            4            5            2           10 
## 0.0033414680 0.0099117702 0.0129904261 0.0781678243 0.1716913835 
##            1 
## 0.7195419561
wea = data.frame(table(fatalities$WEATHER,"STATE_NAME" = fatalities$STATE_NAME))
wea = cast(wea, STATE_NAME ~Var1)
colnames(wea) = c("STATE_NAME","CLEAR", "RAIN","HAIL","SNOW","FOG","HEAVY WIND","SAND", "OTHER","COUDY","BLOWING SNOW","FREEZING RAIN")
row.names(wea) = wea$STATE_NAME
heatmaply((sapply(select(wea,-c(STATE_NAME,CLEAR)),as.numeric)),labRow = row.names(wea),scale = "row",margins = c(100,150))

Heatmaps have cells aranged as per the fatalities in weather with scaled data. It’s obvious that Clear weather would be having Larger amount of accidents, however it is interesting to see that after Clear weather, Cloudy has the highest accidents and then Rainy. We can also see that california and texas are clustered toghether and its well reasoned that both states share approximately similar weather conditions, fatalities and population.
##Explore with Hierarchiacl clustering and K-Modes clustering:

fatalities = select(fatalities, -c(X, ST_CASE, OBJECT_ID, FID, LATITUDE,LONGITUD,STATE_NAME,DAY_WEEK,HOUR))
melt_cor = (round(cor(fatalities),7))
#melt_cor[lower.tri(melt_cor)] <- NA
melt_cor = melt(melt_cor,na.rm=TRUE)
ggplot(data = melt_cor, aes(X1, X2, fill = value))+geom_tile(color = "white")+labs(title="Correlation matrix for variables")+scale_fill_gradient2(low = "blue", high = "red", mi= "white",  midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +theme_minimal()+ theme(axis.text.x =element_text(angle= 45, vjust = 1, size = 7, hjust = 1),axis.title = element_blank())+coord_fixed()

It seems that Types of crashes such as one car crash, two car crash are correlated to Number of people fatalities, and types of event. Also Positive check of BAC and drunken driver are correlated, hence we can think to reomve some correalted variables, as correlation won’t reduce the power of prediction, as correlated variables does not mean they are dependent.
Now, to see how many similar types of Fatalities are involved we can do K-means. As we have categorical data we will evaluate the results based upon K-means(one-hot encoding) and K-modes.

write.csv(fatalities,'filtered_fatalities.csv')